
import json
import spacy
import textacy
import warnings
import requests
import wordcloud
import nltk.corpus
import pandas as pd
import IPython.display
import geopandas as gpd
import urllib.request as ur
import plotly.express as px
import plotly.subplots as ps
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import sklearn.feature_extraction.text
warnings.filterwarnings('ignore')

# Source: Destatis (GENESIS) table 13211-0007 — unemployment rate by
# federal state. Semicolon-separated, Latin-1 encoded, header noise skipped.
csv_url = ("https://www-genesis.destatis.de/genesis/downloads/00/tables/"
           "13211-0007_00.csv")
df = pd.read_csv(csv_url, delimiter=";", encoding="ISO-8859-1",
                 skiprows=[0, 1, 2, 3, 5])

# Give the unnamed / German columns readable names.
# NOTE(review): the "Unemplyement" misspelling is kept deliberately — every
# later cell references this exact column label.
readable_columns = {
    "Unnamed: 0": "State",
    "Unnamed: 1": "Year",
    "Arbeitslosenquote aller zivilen Erwerbspersonen": "Unemplyement Rate (%)",
}
df = df.rename(columns=readable_columns)

# Keep the years 2000 onwards and convert the German decimal comma so the
# rate becomes a proper float.
df = df[df["Year"] >= 2000]
df["Year"] = df["Year"].astype(int)
rate_as_text = df["Unemplyement Rate (%)"].str.replace(",", ".")
df["Unemplyement Rate (%)"] = rate_as_text.astype(float)

# Snapshot of the latest full year for the choropleth.
df_2022 = df[df["Year"] == 2022].copy()

# Federal-state boundary polygons, joined onto the 2022 rates by state name.
germany_geojson = ("https://raw.githubusercontent.com/isellsoap/"
                   "deutschlandGeoJSON/main/2_bundeslaender/1_sehr_hoch.geo.json")
gdf = gpd.read_file(germany_geojson)
merged_data = gdf.merge(df_2022, left_on="name", right_on="State", how="left")
# Choropleth of the 2022 unemployment rate per federal state.
fig = px.choropleth(
    merged_data,
    geojson=merged_data["geometry"],
    locations=merged_data.index,
    color="Unemplyement Rate (%)",
    hover_name="State",
    color_continuous_scale="PuRd",
)
# Zoom to Germany only and hide the base world map.
fig.update_geos(fitbounds="locations", visible=False)
# Plain string: the original carried a pointless f-prefix with no placeholders.
fig.update_layout(
    title=dict(
        text="While Bayern is doing good,<br>"
             "Bremen with a 10% unemployment rate needs immediate action!",
        font=dict(size=22),
    )
)
fig.show()
# Compare announced jobs vs. job seekers over time for the best- and
# worst-performing states.
df_bayern = df[df["State"] == "Bayern"]
df_bremen = df[df["State"] == "Bremen"]

fig = ps.make_subplots(rows=1, cols=2, subplot_titles=("Bayern", "Bremen"))

# One seekers/jobs pair of traces per state; only the first (Bayern) pair
# feeds the shared legend so each series appears there once.
for col, (state_df, in_legend) in enumerate([(df_bayern, True),
                                             (df_bremen, False)], start=1):
    fig.add_trace(go.Scatter(x=state_df["Year"], y=state_df["Arbeitslose"],
                             mode="lines+markers", name="Job Seekers",
                             marker_color="gray",
                             line={"color": "lightgray", "width": 4},
                             showlegend=in_legend),
                  row=1, col=col)
    fig.add_trace(go.Scatter(x=state_df["Year"],
                             y=state_df["Gemeldete Arbeitsstellen"],
                             mode="lines+markers", name="Announced Jobs",
                             marker_color="blue",
                             line={"color": "lightblue", "width": 5},
                             showlegend=in_legend),
                  row=1, col=col)

# Plain string title: the original carried a pointless f-prefix.
fig.update_layout(
    title=dict(text="While the gap is being filled in Bayern, we should fill the gap for Bremen!",
               font=dict(size=25)),
    xaxis_title="Year",
    template="plotly_white",
    legend=dict(yanchor="top", y=1.0, xanchor="left", x=0.3),
)

# Seekers-to-jobs ratio call-outs in paper coordinates
# (left = Bayern panel, right = Bremen panel).
for x, y, note in [
    (0.2, 0.28, "Job Seekers / Jobs = 1.5<br>3 Job Seekers: 2 Jobs"),
    (0.8, 0.4, "Job Seekers / Jobs = 4.2<br>4 Job Seekers: 1 Job"),
]:
    fig.add_annotation(dict(font=dict(color="red", size=15),
                            x=x,
                            y=y,
                            showarrow=False,
                            text=note,
                            textangle=0,
                            xanchor="left",
                            xref="paper",
                            yref="paper"))
fig.show()
# German spaCy pipeline for downstream NLP.
# NOTE(review): the original also ran spacy.load("en_core_web_sm") and
# discarded the result — a pure no-op load, removed here. `nlp` itself is
# not used in the visible code either; kept in case later cells rely on it.
nlp = spacy.load("de_core_news_md")

# German + English stop words, extended with job-ad boilerplate:
# gender tags, schedule terms, currency markers, the city name itself.
stop_words = set(nltk.corpus.stopwords.words("german")) | set(nltk.corpus.stopwords.words("english"))
stp_s = {'€', 'gn', 'ab', 'de', 'm/w/d', 'd/m/w', 'm/f/d', 'f/m/d', 'w/m/d',
         'm/f/x', 'teilzeit', 'vollzeit', 'eur', 'bremen', 'mitarbeiter',
         'mitarbeiterin', 'genders'}
stop_words = stop_words.union(stp_s)

# Punctuation characters stripped from job titles before tokenising.
bad_chars = [';', ':', '!', "*", "(", ")", "-"]

# Context-manage the scraped-ads file instead of leaking the handle
# (the original opened it and never closed it).
with open('Final_Bremen.json') as f:
    data = json.load(f)
# Normalise each scraped job title: drop gender tags, punctuation and stop
# words, producing one cleaned title string per posting.
# Order matters: parenthesised tags are removed before the bare "m/w/d".
gender_tags = ['(m/w/d)', '(d/m/w)', '(m/f/d)', '(f/m/d)', '(w/m/d)',
               '(m/f/x)', 'm/w/d']
job_titles = []
for entry in data:
    job_title = entry["job_title"].lower()
    for tag in gender_tags:
        job_title = job_title.replace(tag, '')
    # Strip punctuation character-by-character (bad_chars holds single chars).
    cleaned = ''.join(ch for ch in job_title if ch not in bad_chars)
    words = cleaned.split()
    job_titles.append(' '.join(w for w in words if w not in stop_words))
# Alias used by the TF-IDF cells below.
document = job_titles
# Persist the cleaned titles (one per line) and keep a concatenated copy.
# join() replaces the original quadratic `text = text + item` loop; the
# `with` block guarantees the file is closed (the original relied on a
# manual close that would be skipped on error).
text = ''.join(document)
with open('document.txt', 'w') as title_file:
    title_file.writelines(item + '\n' for item in document)
# Rank job-title terms by their mean TF-IDF score across all postings
# (a proxy for which positions the market advertises most).
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(document)
tfidf_scores = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()

# Vectorized column mean replaces the original per-column sum/len loop.
job_tfidf_scores = dict(zip(feature_names, tfidf_scores.mean(axis=0)))
sorted_job_positions = sorted(job_tfidf_scores,
                              key=job_tfidf_scores.get, reverse=True)

top_high_demand_positions = 20
# Tokens that are noise (truncated words, generic fillers) rather than roles.
noise_tokens = {"sachbeart", "sachbearter", "aufgaben", "allgemeen", "nen",
                "sachgebietsleer", "referent", "in", "junior", "manager",
                "bereic", "senior", "controller"}
result = ""
for position in sorted_job_positions[:top_high_demand_positions]:
    if position in noise_tokens:
        continue
    # Repeat each term proportionally to its score so the word-cloud font
    # sizes reflect demand.
    result += " " + " ".join([position] * int(job_tfidf_scores[position] * 500))
wc1 = wordcloud.WordCloud(background_color="white", width=800, height=400,
                          colormap="magma").generate(result)
# Same mean-TF-IDF ranking, now over job seekers' preferred fields.
# NOTE(review): this rebinds `df`, clobbering the unemployment frame —
# kept because the original did the same and nothing below reads the old df.
df = pd.read_csv('jobseeker_data.csv')
job_positions = df['Job Field']

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(job_positions)
tfidf_scores = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()

# Vectorized column mean replaces the original per-column sum/len loop.
job_tfidf_scores = dict(zip(feature_names, tfidf_scores.mean(axis=0)))
sorted_job_positions = sorted(job_tfidf_scores,
                              key=job_tfidf_scores.get, reverse=True)

top_high_demand_positions = 20
# Filler tokens (articles, truncated words, generic titles) to exclude.
noise_tokens = {"sachbeart", "sachbearter", "im", "allgemeen", "nen",
                "sachgebietsleer", "referent", "und", "junior", "manager",
                "bereic", "senior", "controller"}
result = ""
for position in sorted_job_positions[:top_high_demand_positions]:
    if position in noise_tokens:
        continue
    # Repeat each term proportionally to its score for the word-cloud sizing.
    result += " " + " ".join([position] * int(job_tfidf_scores[position] * 1000))
wc2 = wordcloud.WordCloud(background_color="white", width=800,
                          height=400).generate(result)
# Headline for the notebook, then the two word clouds side by side.
md = "### We should address the gap by matching available jobs and job seekers' preferences."
IPython.display.display(IPython.display.Markdown(md))

# plt.figure replaces the original plt.subplots(...) whose (fig, ax) return
# was discarded — on current matplotlib that stray full-size axes would
# linger behind the plt.subplot() panels.
plt.figure(figsize=(15, 10))

plt.subplot(1, 2, 1)
plt.imshow(wc1, interpolation="bilinear")
plt.axis("off")  # no ticks/labels around the image
plt.title("Top Available Jobs \n in the Current Market", fontdict={'fontsize': 30})

plt.subplot(1, 2, 2)
plt.imshow(wc2, interpolation="bilinear")
plt.axis("off")  # no ticks/labels around the image
plt.title("Top Job Seekers' \n Preferred Careers", fontdict={'fontsize': 30})

# Adjust layout for the subplots, then render.
plt.tight_layout()
plt.show()